> library(knitr)
> library(extrafont)
> library(stringr)
> library(ggplot2)
> library(dplyr)
> library(gplots)
>
> # remove scientific notation unless the numbers get really big
> options(scipen = 5)
>
> experiment = c(rep("P10_control", 6), rep("P10_rescue", 6), rep("P13_control",
+ 6), rep("P13_rescue", 6))
> experiment = rep(experiment, 16)
> age = unname(sapply(experiment, function(x) {
+ strsplit(x, "_")[[1]][1]
+ }))
> status = unname(sapply(experiment, function(x) {
+ strsplit(x, "_")[[1]][2]
+ }))
> metadata = data.frame(experiment = experiment, age = age, status = status)
>
> umi_count_file = "../data/retinaAligned.out.cleaned.counts_umi.gz"
>
>
> umi_counts = function(umi_count_file) {
+ plate = tbl_df(read.table(gzfile(umi_count_file), sep = "\t"))
+ colnames(plate) = c("umi", "gene", "cell", "counts")
+ byumi = plate %>% group_by(umi) %>% summarise(sum = sum(counts))
+ byumi = subset(byumi, !grepl("N", umi))
+ byumi$umi = as.character(byumi$umi)
+ return(byumi)
+ }
There is not a huge skew in UMI distribution towards G-repeats that we have seen with some other libraries, the top UMI tags look pretty reasonable:
> umi = umi_counts(umi_count_file)
> umi = umi[order(-umi$sum), ]
> kable(head(umi, 25), format = "html")
| umi | sum |
|---|---|
| ACAAGTGAAA | 14760 |
| GGGGTAGAAT | 14117 |
| GCGTGTGCTA | 13299 |
| GGATGATTGT | 12616 |
| GGGACGATTT | 12597 |
| CGCCAACAGT | 12408 |
| CGTTACTTGC | 11713 |
| GGCAGGTCAA | 11631 |
| CGGGGAGCGT | 11578 |
| AAATAGAAAC | 11346 |
| GCGGAGAAGA | 11338 |
| AGCCCGTCGT | 10981 |
| GATGAGGGGT | 10958 |
| TGGGGGGTGG | 10793 |
| GCTCTCATGG | 10762 |
| AACGGTAGGG | 10698 |
| AAACTGTACA | 10689 |
| TGGAGCTGAG | 10671 |
| GGGGGCTTTA | 10610 |
| TATGAAAGGC | 10575 |
| GTGCACAGGG | 10505 |
| ACAGGAGGCG | 10354 |
| TTCTGTAGAG | 10161 |
| GCTGTTGTCG | 10135 |
| GGACGACGGA | 10134 |
The UMI content is slightly skewed, with UMI with a higher GC content more likely to be sequenced:
> umi$G = str_count(umi$umi, "G")
> umi$T = str_count(umi$umi, "T")
> umi$C = str_count(umi$umi, "C")
> umi$A = str_count(umi$umi, "A")
> umi$GC = (umi$G + umi$C)/(umi$G + umi$C + umi$T + umi$A)
> ggplot(umi, aes(as.factor(GC), sum)) + geom_boxplot() + scale_y_log10() + xlab("GC content") +
+ ylab("reads mapped") + theme_bw(base_size = 12, base_family = "Gill Sans MT") +
+ theme(panel.grid.major = element_line(size = 0.5, color = "grey"))
> sort_samplenames = function(samplenames) {
+ sample = unlist(lapply(str_split(samplenames, "[.]"), function(x) {
+ x[1]
+ }))
+ ends = unlist(lapply(str_split(samplenames, "[.]"), function(x) {
+ x[2]
+ }))
+ plate = unlist(lapply(str_split(samplenames, "_"), function(x) {
+ x[1]
+ }))
+ well = unlist(lapply(str_split(samplenames, "_"), function(x) {
+ x[2]
+ }))
+ row = substr(well, 1, 1)
+ plate = as.numeric(substr(well, 2, 3))
+ df = data.frame(id = samplenames, row = row, plate = plate)
+ ord = df[order(df$row, df$plate), ]
+ return(as.character(ord$id))
+ }
>
> counts_from_umi = function(umi_count_file, tag) {
+ plate = tbl_df(read.table(gzfile(umi_count_file), sep = "\t"))
+ colnames(plate) = c("umi", "gene", "cell", "counts")
+ byumi = plate %>% group_by(umi) %>% summarise(sum = sum(counts))
+ umi_to_keep = subset(byumi, !grepl("N", umi))$umi
+ umi_filter = subset(plate, plate$umi %in% umi_to_keep)
+ umi_filter$counts_umi = 1
+ bygene_sample = umi_filter %>% group_by(gene, cell) %>% summarise(sum = sum(counts_umi))
+ bygene_sample = data.frame(bygene_sample)
+ counts = reshape(bygene_sample, timevar = "cell", idvar = "gene", direction = "wide")
+ counts[is.na(counts)] = 0
+ row.names(counts) = counts$gene
+ counts$gene = NULL
+ colnames(counts) = lapply(colnames(counts), function(x) {
+ paste(tag, strsplit(x, "sum.")[[1]][2], sep = ".")
+ })
+ counts = counts[order(rownames(counts)), sort_samplenames(colnames(counts))]
+ return(counts)
+ }
>
> umi = counts_from_umi(umi_count_file, "retina")
> colnames(umi) = unname(sapply(colnames(umi), function(x) {
+ strsplit(x, "_")[[1]][2]
+ }))
> metadata$well = colnames(umi)
> metadata$row = substr(colnames(umi), 1, 1)
There are only a small number of genes detected in each well.
> qplot(colSums(umi > 0)) + geom_histogram() + xlab("genes detected per well") +
+ theme_bw(base_size = 12, base_family = "Gill Sans MT") + theme(panel.grid.major = element_line(size = 0.5,
+ color = "grey"))
There are only a small number of counts per well, less than 1000:
> summary(colSums(umi))
Min. 1st Qu. Median Mean 3rd Qu. Max.
12.0 172.5 404.5 705.1 747.0 11630.0
> qplot(colSums(umi)) + geom_histogram() + xlab("total counts per well") + theme_bw(base_size = 12,
+ base_family = "Gill Sans MT") + theme(panel.grid.major = element_line(size = 0.5,
+ color = "grey"))
Most detected genes only have a small number of reads associated with them with a large number of genes just having one read of evidence.
> qplot(rowSums(umi)) + geom_histogram() + xlab("total reads per gene") + theme_bw(base_size = 12,
+ base_family = "Gill Sans MT") + scale_x_log10() + theme(panel.grid.major = element_line(size = 0.5,
+ color = "grey"))
The P10 and P13 cells have different distributions of counts which will confound things somewhat:
> df = data.frame(well = colnames(umi), counts = colSums(umi))
> m = merge(df, metadata, by = "well", all.x = TRUE)
> ggplot(m, aes(well, counts)) + geom_bar(stat = "identity") + facet_wrap(~experiment) +
+ ylab("counts per well") + theme_bw(base_size = 12, base_family = "Gill Sans MT") +
+ theme(panel.grid.major = element_line(size = 0.5, color = "grey"), axis.ticks.x = element_blank(),
+ axis.text.x = element_blank())
There isn’t really any clustering of the samples based on age or rescue status, which is not good if we are hoping to call anything DE.
> library(CHBUtils)
> mds(cor(umi), condition = experiment)
> mds(cor(umi), condition = metadata$row)
Hierarchical clustering shows this as well, the colors at the top are the different experiments. This data has each gene normalized by the maximum count for that gene.
> exp_colors = data.frame(experiment = c("P10_control", "P10_rescue", "P13_control",
+ "P13_rescue"), colors = c("blue", "red", "yellow", "green"))
> colors = merge(metadata, exp_colors, by = "experiment", all.x = TRUE)
>
> m = as.matrix(umi)
> heatmap(m/apply(m, 1, max), zlim = c(0, 1), col = gray.colors(100), Rowv = NA,
+ labRow = NA, scale = "none", ColSideColors = as.character(colors$colors))
Based on the above we shouldn’t really be expecting there to be any differences. I tried running SCDE for all the samples, P10 vs P13 and rescue vs control and neither came out as anything. I also treated P10 and P13 as a batch effect and looked at the effect of rescue vs control and couldn’t find anything either.
> library(scde)
>
> scde_count_prep = function(counts, groups, batch = NULL, sample_min = 5000,
+ sample_max = Inf) {
+ keep_genes = rowSums(counts) > 0
+ keep_samples = (colSums(counts) > sample_min) & (colSums(counts) < sample_max)
+ groups = droplevels(groups[keep_samples])
+ counts = counts[keep_genes, keep_samples]
+ names(groups) = colnames(counts)
+ if (!any(unlist(lapply(batch, is.na)))) {
+ batch = droplevels(batch[keep_samples])
+ names(batch) = colnames(counts)
+ }
+ return(list(counts = counts, groups = groups, batch = batch))
+ }
>
>
> scde_de = function(scde) {
+ o.ifm = scde.error.models(counts = scde$counts, groups = scde$groups, n.cores = 1,
+ threshold.segmentation = TRUE, save.crossfit.plots = FALSE, save.model.plots = FALSE,
+ verbose = 2)
+ valid.cells = o.ifm$corr.a > 0
+ o.ifm = o.ifm[valid.cells, ]
+ groups = scde$groups[valid.cells]
+ names(groups) = row.names(o.ifm)
+ if (!any(unlist(lapply(scde$batch, is.na)))) {
+ batch = scde$batch[valid.cells]
+ names(batch) = row.names(o.ifm)
+ }
+ counts = scde$counts[, valid.cells]
+ o.prior = scde.expression.prior(models = o.ifm, counts = scde$counts, length.out = 400,
+ show.plot = T)
+ ediff = scde.expression.difference(o.ifm, scde$counts, o.prior, groups = groups,
+ batch = batch, n.randomizations = 100, verbose = 1)
+ if (!any(unlist(lapply(scde$batch, is.na)))) {
+ ediff = ediff$batch.adjusted
+ }
+
+ ediff$pvalue = pnorm(-(abs(ediff$Z))) * 2
+ ediff$padj = p.adjust(ediff$pvalue)
+ return(ediff)
+ }
> s = scde_count_prep(umi, metadata$status, metadata$age, 1000, 20000)
> d = scde_de(s)
cross-fitting cells.
building individual error models.
controlling for batch effects. interaction:
batch
groups P10 P13
control 28 6
rescue 28 5
calculating batch posteriors
calculating batch differences
calculating difference posterior
summarizing differences
adjusting for batch effects
> table(d$padj < 0.05)
FALSE
10082
> library(reshape)
> combined = read.table("../data/bulk/annotated_combined.counts", sep = "\t",
+ header = TRUE)
> combined$symbol = toupper(combined$symbol)
> summarydata = read.table("../data/bulk/project-summary.csv", sep = ",", header = TRUE)
>
> load_gene_list = function(in_fn) {
+ gene_list = read.table(in_fn, header = FALSE)
+ colnames(gene_list) = "symbol"
+ gene_list$symbol = toupper(gene_list$symbol)
+ return(gene_list)
+ }
> missing_genes = function(x, known) {
+ return(x[!toupper(x$symbol) %in% toupper(known$symbol), ])
+ }
>
> symbols_to_ensembl = function(x) {
+ require(biomaRt)
+ mart = useMart(biomart = "ensembl", dataset = "mmusculus_gene_ensembl")
+ x = getBM(mart, filters = "mgi_symbol", attributes = c("mgi_symbol", "ensembl_gene_id"),
+ values = x)
+ return(x)
+ }
>
> unigene_to_ensembl = function(x) {
+ require(biomaRt)
+ mart = useMart(biomart = "ensembl", dataset = "mmusculus_gene_ensembl")
+ x = getBM(mart, filters = "unigene", attributes = c("unigene", "mgi_symbol",
+ "ensembl_gene_id"), values = x)
+ return(x)
+ }
>
> get_go_terms = function() {
+ require(biomaRt)
+ mart = useMart(biomart = "ensembl", dataset = "mmusculus_gene_ensembl")
+ x = getBM(mart = mart, attributes = c("ensembl_gene_id", "mgi_symbol", "go_id",
+ "name_1006"))
+ return(x)
+ }
>
> go_terms = get_go_terms()
> apoptosis_list = go_terms[grepl("apopto", go_terms$name_1006), ]
> colnames(apoptosis_list) = c("id", "symbol")
> housekeeping_list = read.table("../metadata/housekeeping.txt")
> colnames(housekeeping_list) = c("unigene")
> housekeeping_list = unigene_to_ensembl(housekeeping_list$unigene)
> colnames(housekeeping_list) = c("unigene", "symbol", "id")
>
> rod_gene_list = read.table("../metadata/rod_gene_list.txt", header = TRUE)
> glycolysis_list = read.table("../metadata/glycolysis.txt", header = TRUE, sep = "\t")
> calcium_list = read.table("../metadata/calcium_signaling_pathway.txt", header = TRUE,
+ sep = "\t")
> nrf2_list = read.table("../metadata/oxstressnrf2.txt", header = TRUE, sep = "\t")
> hif1_list = read.table("../metadata/hif1a.txt", header = TRUE, sep = "\t")
> oxidative_list = read.table("../metadata/oxidative_stress.txt", sep = "\t",
+ header = TRUE)
>
> normalize_bulk_counts = function(counts) {
+ library(edgeR)
+ counts = combined
+ rownames(counts) = counts$id
+ counts$id = NULL
+ counts$symbol = NULL
+ y = DGEList(counts = counts)
+ y = calcNormFactors(y)
+ counts = cpm(y, normalized.lib.sizes = TRUE)
+ col_order = order(colnames(counts))
+ counts = counts[, col_order]
+ return(counts)
+ }
> counts = normalize_bulk_counts(combined)
>
> umi_col_order = order(colnames(umi))
> umi = umi[, umi_col_order]
>
> make_bulk_heatmap = function(counts, gene_list, cexRow = 0.6, cexCol = 0.6,
+ title = "") {
+ gene_list = unique(gene_list[, c("id", "symbol")])
+ rownames(gene_list) = gene_list$id
+ mat = as.matrix(subset(counts, rownames(counts) %in% gene_list$id))
+ mat = as.matrix(mat)
+ mat = sqrt(mat)
+
+ rownames(mat) = gene_list[rownames(mat), ]$symbol
+ pheatmap(mat, col = grey(seq(1, 0, -0.01)), cluster_cols = FALSE, cluster_rows = FALSE,
+ main = title)
+ }
>
> make_singlecell_heatmap = function(counts, gene_list, cexRow = 0.6, cexCol = 0.6,
+ title = "") {
+ gene_list = unique(gene_list[, c("id", "symbol")])
+ rownames(gene_list) = gene_list$id
+ mat = as.matrix(subset(counts, rownames(counts) %in% gene_list$id))
+ mat = as.matrix(mat)
+ mat = sqrt(mat)
+ rownames(mat) = gene_list[rownames(mat), ]$symbol
+ pheatmap(mat, col = grey(seq(1, 0, -0.01)), cluster_cols = FALSE, show_colnames = FALSE,
+ cluster_rows = FALSE, main = title)
+ }
> make_bulk_heatmap(counts, rod_gene_list, title = "Square root of bulk counts for rod genes")
> make_singlecell_heatmap(umi, rod_gene_list, title = "Square root of single cell counts for rod genes")
> make_bulk_heatmap(counts, apoptosis_list, title = "Square root of bulk counts for apoptosis genes")
> make_singlecell_heatmap(umi, apoptosis_list, title = "Square root of single cell counts for apoptosis genes")
> make_bulk_heatmap(counts, housekeeping_list, title = "Square root of bulk counts for housekeeping genes")
> make_singlecell_heatmap(umi, housekeeping_list, title = "Square root of single cell counts for housekeeping genes")
> make_bulk_heatmap(counts, glycolysis_list, title = "Square root of bulk counts for glycolysis genes")
> make_singlecell_heatmap(umi, glycolysis_list, title = "Square root of single cell counts for glycolysis genes")
> make_bulk_heatmap(counts, calcium_list, title = "Square root of bulk counts for calcium genes")
> make_singlecell_heatmap(umi, calcium_list, title = "Square root of single cell counts for calcium genes")
> make_bulk_heatmap(counts, nrf2_list, title = "Square root of bulk counts for nrf2 genes")
> make_singlecell_heatmap(umi, nrf2_list, title = "Square root of single cell counts for nrf2 genes")
> make_bulk_heatmap(counts, hif1_list, title = "Square root of bulk counts for hif1 genes")
> make_singlecell_heatmap(umi, hif1_list, title = "Square root of single cell counts for hif1 genes")
> make_bulk_heatmap(counts, oxidative_list, title = "Square root of bulk counts for oxidative genes")
> make_singlecell_heatmap(umi, oxidative_list, title = "Square root of single cell counts for oxidative genes")
> write_tables = function(l, bulk, sc, name) {
+ write.table(subset(bulk, id %in% l$id), file = paste(name, "_bulk.tsv",
+ sep = ""), quote = FALSE, col.names = TRUE)
+ write.table(subset(sc, rownames(sc) %in% l$id), file = paste(name, "_singlecell.tsv",
+ sep = ""), quote = FALSE, col.names = TRUE)
+ }
> counts_table = as.data.frame(counts)
> counts_table$id = rownames(counts)
> m = merge(counts_table, combined[, c("id", "symbol")], by = "id", all.x = TRUE)
> write.table(m, file = "bulk_counts.tsv", sep = "\t", quote = FALSE, col.names = TRUE)
> umi_table = as.data.frame(umi)
> umi_table$id = rownames(umi)
> m = merge(umi_table, combined[, c("id", "symbol")], by = "id", all.x = TRUE)
> write.table(m, file = "umi_counts.tsv", sep = "\t", quote = FALSE, col.names = TRUE)